\documentclass[12pt,journal,compsoc,onecolumn]{./res/IEEEtran}

\usepackage[utf8]{inputenc}
\usepackage[nocompress]{cite}
\usepackage[pdftex]{graphicx}
\graphicspath{{./res/}}
\DeclareGraphicsExtensions{.pdf,.jpg,.png}
\usepackage{fixltx2e}
\usepackage{url}
\usepackage{float}
\usepackage[justification=centering]{caption}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{verbatim}

\hyphenation{cycling asso-ciate para-lle-lism learning}

\begin{document}
\title{Automatic Natural Language Classification using Distributed SOM Learning}

\author{Ayala,~R.,~\href{mailto:ricardo.ayala@oracle.com}{ricardo.ayala@oracle.com},~\IEEEmembership{~Oracle}
Bahena,~D.,~\href{mailto:dario.bahena@oracle.com}{dario.bahena@oracle.com},~\IEEEmembership{~Oracle}
        Rodríguez,~L.,~\href{mailto:luis.g.rodriguez@oracle.com}{luis.g.rodriguez@oracle.com},~\IEEEmembership{~Oracle}        
        Zavaleta,~R.,~\href{mailto:ricardo.zavaleta@oracle.com}{ricardo.zavaleta@oracle.com},~\IEEEmembership{~Oracle}}%

\markboth{MSC. IN COMPUTER SCIENCE, CINVESTAV-ORACLE,  BIG-DATA, APRIL 2014}%
{}

\IEEEtitleabstractindextext{
  \begin{abstract}
    Word syntactic categorization and automatic text classification by
    topic are the two initial problems we want to attack in the course. The
    particular machine learning technique we are initially interested
    in is that of Self-Organizing Maps (particularly, the
    offline variant of its learning algorithm); the idea is to either
    enhance or compare this technique with the others we will review
    during the course. Our programming language choice
    is Python, in order to leverage RAD and existing open-source tools. The
    distributed collection framework Spark was preferred over Hadoop,
    as it allows a more natural implementation of the iterative
    learning algorithm (as opposed to Hadoop's batch
    paradigm\footnote{It is worth mentioning that Spark can be considered an
    upper layer of Hadoop, as it is capable of operating on collections
    based on HDFS files.}). Additional tools being considered are the Natural
    Language Toolkit (NLTK) and the scientific computing library
    NumPy. Besides the abundance of software frameworks and tools, the
    problem of natural language was selected given the great
    diversity and abundance of data sources (e.g., Project Gutenberg, Wikipedia,
    Twitter, blogs, forums, etc.); preference will be given to sources which
    are already self-classified (to allow us to compare the
    accuracy of our results with a simple coloring visualization of the SOM).
  \end{abstract}
}

\maketitle
\input{01-the-problems.tex}
\input{02-machlearn-tools.tex}
\input{03-software-tools.tex}
\input{04-data-sources.tex}
\input{05-biblio.tex}
\input{06-authors-bios.tex}
\end{document}

